# 데이터 읽어오기
# url이거나 txt파일인 경우 -> read.table
# csv 파일인 경우 -> read.csv
# 엑셀 파일인 경우 -> read.excel
dwaine<-read.table(
"http://www.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%20%206%20Data%20Sets/CH06FI05.txt")
# 컬럼 지정
names(dwaine)<-c("X1","X2","Y")
# 데이터 확인
# 일부
head(dwaine)
## X1 X2 Y
## 1 68.5 16.7 174.4
## 2 45.2 16.8 164.4
## 3 91.3 18.2 244.2
## 4 47.8 16.3 154.6
## 5 46.9 17.3 181.6
## 6 66.1 18.2 207.5
# 차원
dim(dwaine)
## [1] 21 3
# 요약
summary(dwaine)
## X1 X2 Y
## Min. :38.40 Min. :15.80 Min. :137.2
## 1st Qu.:47.80 1st Qu.:16.30 1st Qu.:152.8
## Median :52.30 Median :17.10 Median :166.5
## Mean :62.02 Mean :17.14 Mean :181.9
## 3rd Qu.:82.70 3rd Qu.:18.10 3rd Qu.:209.7
## Max. :91.30 Max. :19.10 Max. :244.2
pairs(dwaine)
library(ggplot2)
library(GGally)
ggpairs(dwaine)
library(dplyr)
library(plotly)
p <- plot_ly(dwaine, x = ~X1, y = ~X2, z = ~Y,
colors = c('#BF382A')) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'X1'),
yaxis = list(title = 'X2'),
zaxis = list(title = 'Y')))
p
Y<-dwaine$Y
X<-cbind(1,dwaine$X1, dwaine$X2)
bhat<-solve(t(X)%*%X)%*%t(X)%*%Y
bhat
## [,1]
## [1,] -68.85707
## [2,] 1.45456
## [3,] 9.36550
lm.dwaine<- lm(Y~X1+X2,data=dwaine)
summary(lm.dwaine)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = dwaine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4239 -6.2161 0.7449 9.4356 20.2151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.8571 60.0170 -1.147 0.2663
## X1 1.4546 0.2118 6.868 2e-06 ***
## X2 9.3655 4.0640 2.305 0.0333 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.01 on 18 degrees of freedom
## Multiple R-squared: 0.9167, Adjusted R-squared: 0.9075
## F-statistic: 99.1 on 2 and 18 DF, p-value: 1.921e-10
Yhat<-fitted(lm.dwaine)
Yhat.1<-X%*% bhat
H<-X%*%solve(t(X)%*%X)%*%t(X)
Yhat.2<-H%*%Y
head(cbind(Yhat, Yhat.1, Yhat.2))
## Yhat
## 1 187.1841 187.1841 187.1841
## 2 154.2294 154.2294 154.2294
## 3 234.3963 234.3963 234.3963
## 4 153.3285 153.3285 153.3285
## 5 161.3849 161.3849 161.3849
## 6 197.7414 197.7414 197.7414
plot(Yhat~Y)
plot(Yhat ~ Y,
xlab = "Observed Y",
ylab = "Predicted Y",
pch = 19,
xlim = c(130, 250),
ylim = c(130, 250),
col = "gray30",
cex = 1.5)
abline(0, 1, col = "steelblue", lwd = 1)
grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dashed")
Y-Yhat
## 1 2 3 4 5 6
## -12.7841146 10.1705737 9.8036764 1.2714690 20.2150722 9.7585779
## 7 8 9 10 11 12
## 0.7449178 -4.6666316 -12.3381967 0.3539791 11.5126289 -6.0849209
## 13 14 15 16 17 18
## 9.3142995 3.7815611 -13.1132116 -18.4238900 0.6530062 -15.0013134
## 19 20 21
## 1.6129777 -6.2160615 9.4356009
residuals(lm.dwaine)
## 1 2 3 4 5 6
## -12.7841146 10.1705737 9.8036764 1.2714690 20.2150722 9.7585779
## 7 8 9 10 11 12
## 0.7449178 -4.6666316 -12.3381967 0.3539791 11.5126289 -6.0849209
## 13 14 15 16 17 18
## 9.3142995 3.7815611 -13.1132116 -18.4238900 0.6530062 -15.0013134
## 19 20 21
## 1.6129777 -6.2160615 9.4356009
library(ggfortify)
autoplot(lm.dwaine)
anova(lm.dwaine)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 23371.8 23371.8 192.8962 4.64e-11 ***
## X2 1 643.5 643.5 5.3108 0.03332 *
## Residuals 18 2180.9 121.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm.dwaine)
##
## Call:
## lm(formula = Y ~ X1 + X2, data = dwaine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4239 -6.2161 0.7449 9.4356 20.2151
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.8571 60.0170 -1.147 0.2663
## X1 1.4546 0.2118 6.868 2e-06 ***
## X2 9.3655 4.0640 2.305 0.0333 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.01 on 18 degrees of freedom
## Multiple R-squared: 0.9167, Adjusted R-squared: 0.9075
## F-statistic: 99.1 on 2 and 18 DF, p-value: 1.921e-10
Define the vector \(X_h\):
\[ X_h = \begin{bmatrix} 1 \\ X_{h,1} \\ \vdots \\ X_{h,p-1} \end{bmatrix}, \quad E\{Y_h\} = X_h^T \beta, \quad \hat{Y}_h = X_h^T b \]
Variance of \(\hat{Y}_h\):
\[ \sigma^2\{\hat{Y}_h\} = \sigma^2 X_h^T (X^T X)^{-1} X_h \]
Estimated variance:
\[ s^2\{\hat{Y}_h\} = MSE \, (X_h^T (X^T X)^{-1} X_h) = X_h^T s^2\{b\} X_h \]
The \((1-\alpha)\) confidence limits for \(E\{Y_h\}\) are:
\[ \hat{Y}_h \; \pm \; t(1-\alpha/2; n-p) \; s\{\hat{Y}_h\} \]
head(predict(lm.dwaine, newdata=dwaine, interval = "confidence"))
## fit lwr upr
## 1 187.1841 179.1146 195.2536
## 2 154.2294 146.7591 161.6998
## 3 234.3963 224.7569 244.0358
## 4 153.3285 146.5361 160.1210
## 5 161.3849 152.0778 170.6921
## 6 197.7414 188.5424 206.9404
predict.lm(lm.dwaine, newdata=data.frame(X1=65.4, X2=17.6), interval="confidence", level=0.95)
## fit lwr upr
## 1 191.1039 185.2911 196.9168
The \((1 - \alpha)\) prediction limits for a new observation \(Y_{h}^{(new)}\) corresponding to \(X_h\), the specified values of the \(X\) variables, are:
\[ \hat{Y}_h \; \pm \; t(1-\alpha/2; \, n-p) \, s\{pred\} \]
where:
\[ s^2\{pred\} = MSE + s^2\{\hat{Y}_h\} = MSE \Big( 1 + X_h^T (X^T X)^{-1} X_h \Big) \]
head(predict(lm.dwaine, newdata=dwaine, interval = "prediction"))
## fit lwr upr
## 1 187.1841 162.6910 211.6772
## 2 154.2294 129.9271 178.5317
## 3 234.3963 209.3421 259.4506
## 4 153.3285 129.2260 177.4311
## 5 161.3849 136.4566 186.3132
## 6 197.7414 172.8533 222.6295
predict.lm(lm.dwaine, newdata=data.frame(X1=65.4, X2=17.6), interval="prediction", level=0.95)
## fit lwr upr
## 1 191.1039 167.2589 214.949